import pandas as pd
import numpy as np

# Load the raw order data from CSV into a pandas DataFrame.
df = pd.read_csv('order_train1.csv')

# Preview the first rows and the column summary.
# DataFrame.info() prints its report itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
print(df.head())
df.info()

# Drop every row that contains at least one missing value.
df.dropna(inplace=True)

# Convert the sales channel name to a categorical and replace it with its
# integer category codes.
df['sales_chan_name'] = df['sales_chan_name'].astype('category').cat.codes

# Rename the quantity and price columns to more descriptive names.
df.rename(columns={'ord_qty': 'order_quantity',
                   'item_price': 'unit_price'},
          inplace=True)

# Parse the order date BEFORE sorting: sorting the raw CSV values would
# compare them as text, which is only chronological for zero-padded ISO
# dates (e.g. "2018/10/1" sorts before "2018/9/1" as a string).
df['order_date'] = pd.to_datetime(df['order_date'])

# Sort ascending by order date. A stable algorithm keeps rows that share a
# date in their original file order, so the output is reproducible.
df.sort_values('order_date', ascending=True, kind='mergesort', inplace=True)

# Derived feature: total price of each order line.
df['total_price'] = df['order_quantity'] * df['unit_price']

# Derived calendar features taken from the parsed order date.
df['year'] = df['order_date'].dt.year
df['month'] = df['order_date'].dt.month
df['day'] = df['order_date'].dt.day
df['weekday'] = df['order_date'].dt.weekday  # Monday=0 ... Sunday=6

# Save the preprocessed data to a new CSV file (without the index column).
df.to_csv('processed_order_train1.csv', index=False)
